1 Introduction

This notebook accompanies the user guide “Model Comparison and Calibration Assessment” on arXiv.

The code is similar to the one used in the user guide. Please refer to it for explanations.

Note that the results might vary depending on the R package versions.

2 Regression example

The regression example is based on the “Workers Compensation” claims dataset on OpenML. We will model the ultimate claim amount of claims based on their initial case reserves and other claim features.

2.1 Attach all packages and fetch data

The dataset is being downloaded and then stored on disk for reuse.

library(tidyverse)
library(lubridate)
library(hexbin)
library(MetricsWeighted)
library(splines)
library(xgboost)
library(arrow)
library(ggcorrplot)
library(patchwork)
library(scales)
library(glm2)
library(withr)

# Workers Compensation dataset, see
# https://www.openml.org/d/42876
# Read the cached parquet file if present; otherwise download once from
# OpenML and cache it on disk for later runs.
if (file.exists("workers_compensation.parquet")) {
  df <- read_parquet("workers_compensation.parquet")
} else {
  library(OpenML)

  df_origin <- getOMLDataSet(data.id = 42876L)
  df <- tibble(df_origin$data)
  write_parquet(df, "workers_compensation.parquet")
}

2.2 Data preparation

# Note: WeekDayOfAccident: 1 means Monday
# Note: We filter out rows with WeeklyPay < 200
# Feature engineering: parse date/times, derive reporting delay and hour/
# weekday of accident, log-transform skewed amounts, and cap count-like
# variables. `mutate_at()` is superseded; use `across()` instead.
df <- df %>% 
  filter(WeeklyPay >= 200, HoursWorkedPerWeek >= 20) %>% 
  mutate(
    DateTimeOfAccident = ymd_hms(DateTimeOfAccident),
    DateOfAccident = as_date(DateTimeOfAccident),
    DateReported = ymd(DateReported),
    # log1p handles a delay of 0 days gracefully
    LogDelay = log1p(as.numeric(DateReported - DateOfAccident)),
    HourOfAccident = hour(DateTimeOfAccident),
    WeekDayOfAccident = factor(wday(DateOfAccident, week_start = 1)), 
    LogWeeklyPay = log1p(WeeklyPay),
    LogInitial = log(InitialCaseEstimate),
 #   DependentsOther = as.numeric(DependentsOther >= 1),
    DependentChildren = pmin(4, DependentChildren),
    HoursWorkedPerWeek = pmin(60, HoursWorkedPerWeek)
 ) %>% 
  mutate(
    across(all_of(c("Gender", "MaritalStatus", "PartTimeFullTime")), as.factor)
  ) %>% 
  rename(HoursPerWeek = HoursWorkedPerWeek)

# Feature lists used throughout the regression example
x_continuous <- c("Age", "LogWeeklyPay", "LogInitial", "HourOfAccident",
                 "HoursPerWeek", "LogDelay")
x_discrete <- c("Gender", "MaritalStatus", "PartTimeFullTime",
                "DependentChildren", "DaysWorkedPerWeek", "WeekDayOfAccident")
x_vars <- c(x_continuous, x_discrete)
# Response: the ultimate (final) claim amount
y_var <- "UltimateIncurredClaimCost"

# Quick look at the response and all features
df %>%
  select(all_of(y_var), all_of(x_vars)) %>%
  print(n = 10, width = 80)
## # A tibble: 82,017 x 13
##    UltimateIncurredCl…   Age LogWeeklyPay LogInitial HourOfAccident HoursPerWeek
##                  <dbl> <dbl>        <dbl>      <dbl>          <int>        <dbl>
##  1                102.    45         6.22       9.16              9           38
##  2               1451     40         5.65       8.01             15           38
##  3                320.    50         6.25       6.91              7           38
##  4                108     19         5.30       4.70             14           38
##  5               7111.    19         6.64       9.18             14           40
##  6               8379.    21         5.30       9.16             12           37
##  7              25338.    50         6.79      11.2               0           38
##  8                354     25         5.83       7.70              8           35
##  9               2269.    20         5.51       8.01             11           40
## 10             129367.    43         7.55       9.16             16           40
## # … with 82,007 more rows, and 7 more variables: LogDelay <dbl>, Gender <fct>,
## #   MaritalStatus <fct>, PartTimeFullTime <fct>, DependentChildren <dbl>,
## #   DaysWorkedPerWeek <dbl>, WeekDayOfAccident <fct>

2.3 Exploratory data analysis

2.3.1 Univariate description of target

# Reference lines: mean and median of the target
df_values <- data.frame(
  functional = c("mean", "median"),
  value = c(mean(df[[y_var]]), median(df[[y_var]]))
)

# Density histogram of the target on a log x-axis, with mean/median markers.
# `..density..` and line `size` are deprecated since ggplot2 3.4;
# use after_stat(density) and linewidth instead.
ggplot(df, aes(x = UltimateIncurredClaimCost)) +
  geom_histogram(aes(y = after_stat(density)), bins = 200, fill = "#E69F00") +
  geom_vline(
    data = df_values, 
    aes(xintercept = value, color = functional),
    linetype = "dashed", 
    linewidth = 1
  ) +
  scale_x_log10() +
  xlab("Log(UltimateIncurredClaimCost)") +
  ggtitle("Histogram of UltimateIncurredClaimCost")

2.3.2 Univariate description

# Histograms of the continuous features, one facet each with free scales.
# `select_at()` is superseded; use select(all_of(...)).
df %>%
  select(all_of(x_continuous)) %>%
  pivot_longer(everything()) %>%
  mutate(name = factor(name, levels = x_continuous)) %>%
ggplot(aes(x = value)) +
  geom_histogram(bins = 19, fill = "#E69F00") +
  facet_wrap(~name, scales = "free", ncol = 3) +
  labs(y = element_blank()) +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  ggtitle("Histograms of numerical features")

# Bar plots of the categorical features.
# `select_at()`/`mutate_all()` are superseded; use all_of()/across().
df %>%
  select(all_of(x_discrete)) %>%
  mutate(across(everything(), as.character)) %>% 
  pivot_longer(everything()) %>%
  mutate(
    name = factor(name, levels = x_discrete),
    value = factor(value)
  ) %>%
ggplot(aes(x = value)) +
  geom_bar(fill = "#E69F00") +
  facet_wrap(~name, scales = "free", ncol = 3) +
  labs(y = "Count") +
  ggtitle("Histograms of categorical features")

2.3.3 Correlations across continuous covariates

# Pearson correlation matrix of the continuous features (upper triangle).
# `select_at()` is superseded; use select(all_of(...)).
df %>%
  select(all_of(x_continuous)) %>%
  cor() %>%
  round(2) %>%
  ggcorrplot(
    hc.order = FALSE,
    type = "upper",
    outline.col = "white",
    ggtheme = theme_minimal(),
    colors = c("#6D9EC1", "white", "#E46726")
  )

2.3.4 Response in dependence of covariates

# Response distribution per level of each categorical feature
df_cat <- df %>% 
  select(all_of(x_discrete), all_of(y_var)) %>%
  mutate(across(-all_of(y_var), as.factor)) %>% 
  pivot_longer(cols = -all_of(y_var))

# `aes_string()` is deprecated; use the .data pronoun for string column names
ggplot(df_cat, aes(x = value, y = .data[[y_var]])) +
  geom_boxplot(varwidth = TRUE, fill = "orange") +
  facet_wrap(~ name, scales = "free_x") +
  scale_y_log10() +
  xlab(element_blank()) +
  ggtitle("Boxplots for categorical features")

# Response vs each continuous feature as hexbin plots (log-scaled response)
df_num <- df %>% 
  select(all_of(x_continuous), all_of(y_var)) %>% 
  pivot_longer(cols = -all_of(y_var))

# `aes_string()` is deprecated; use the .data pronoun for string column names
ggplot(df_num, aes(x = value, y = .data[[y_var]])) +
  geom_hex(bins = 18, show.legend = FALSE) +
  facet_wrap(~ name, scales = "free") +
  scale_y_log10() +
  scale_fill_viridis_c(option = "magma", trans = "log10") +
  xlab(element_blank())

2.4 Data split

Next, we split our dataset into training and test data.

# Reproducible 75/25 train/test split (random, not stratified)
set.seed(1234321L)
.in <- sample(nrow(df), round(0.75 * nrow(df)), replace = FALSE)
train <- df[.in, ]
test <- df[-.in, ]
y_train <- train[[y_var]]
y_test <- test[[y_var]]

# Record split membership on df for later grouped evaluation:
# initialize every row to "test", then overwrite the sampled rows
df <- df %>% 
  mutate(dataset = factor(c("test"), c("train", "test")))
df$dataset[.in] <- "train"

# Sanity check: response distribution per split (mean and quantiles)
df %>% 
  group_by(dataset) %>% 
  summarise(
    mean = mean(UltimateIncurredClaimCost),
    q20 = quantile(UltimateIncurredClaimCost, probs=0.2),
    q40 = quantile(UltimateIncurredClaimCost, probs=0.4),
    q50 = median(UltimateIncurredClaimCost),
    q60 = quantile(UltimateIncurredClaimCost, probs=0.6),
    q80 = quantile(UltimateIncurredClaimCost, probs=0.8),
    q90 = quantile(UltimateIncurredClaimCost, probs=0.9)
  )

2.5 The models

2.5.1 Trivial Model

# Trivial model: predicts the training-mean response for every row of X.
# Uses nrow() instead of dim(X)[1] and rep() of the scalar mean directly
# (avoids allocating an all-ones vector just to scale it).
trivial_predict <- function(X) {
  rep(mean(y_train), nrow(X))
}

2.5.2 OLS on log response

# OLS on the raw response (benchmark; may produce negative predictions,
# see the r_squared_gamma note below)
form <- reformulate(x_vars, y_var)
fit_ols <- lm(form, data = train)
summary(fit_ols)
## 
## Call:
## lm(formula = form, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -59570  -17197   -7636    1715 3098226 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -94262.309   4289.601 -21.975  < 2e-16 ***
## Age                   229.202     21.032  10.898  < 2e-16 ***
## LogWeeklyPay         6641.731    484.474  13.709  < 2e-16 ***
## LogInitial           7141.915    141.047  50.635  < 2e-16 ***
## HourOfAccident        -81.457     58.319  -1.397 0.162493    
## HoursPerWeek           -8.964     52.903  -0.169 0.865449    
## LogDelay             1388.656    265.576   5.229 1.71e-07 ***
## GenderM             -3241.978    563.400  -5.754 8.74e-09 ***
## MaritalStatusS      -1214.311    540.533  -2.247 0.024675 *  
## MaritalStatusU       -887.372    828.146  -1.072 0.283941    
## PartTimeFullTimeP    3254.703   1120.526   2.905 0.003678 ** 
## DependentChildren    1169.777    452.950   2.583 0.009809 ** 
## DaysWorkedPerWeek    1101.599    623.100   1.768 0.077077 .  
## WeekDayOfAccident2  -2709.643    740.163  -3.661 0.000252 ***
## WeekDayOfAccident3  -3303.039    736.864  -4.483 7.39e-06 ***
## WeekDayOfAccident4  -3560.326    739.063  -4.817 1.46e-06 ***
## WeekDayOfAccident5  -3909.812    739.677  -5.286 1.26e-07 ***
## WeekDayOfAccident6  -1306.407   1021.934  -1.278 0.201125    
## WeekDayOfAccident7  -3696.886   1240.245  -2.981 0.002876 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54970 on 61494 degrees of freedom
## Multiple R-squared:  0.06627,    Adjusted R-squared:  0.066 
## F-statistic: 242.5 on 18 and 61494 DF,  p-value: < 2.2e-16
# Out-of-sample R-squared relative to the training-mean reference
r_squared(y_test, predict(fit_ols, test), reference_mean = mean(y_train))  
## [1] 0.06800488
# r_squared_gamma(y_test, predict(fit_ols, test), reference_mean = mean(y_train))
# Error because of negative predictions

# OLS on the log response; predictions are back-transformed with exp()
fit_ols_log <- lm(update(form, log(UltimateIncurredClaimCost) ~ .), data = train)

ols_predict <- function(X){
  exp(predict(fit_ols_log, X))
}

# Same with bias correction: multiplicative factor so the mean fitted
# value on the training data matches the observed training mean
corr_fact_ols <- mean(y_train) / mean(exp(fitted(fit_ols_log)))

ols_corr_predict <- function(X) {
  corr_fact_ols * exp(predict(fit_ols_log, X))
}

2.5.3 Gamma GLM

# Note: Standard glm(..) raises
#   Error in glm.fit(x = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  : 
#   NA/NaN/Inf in 'x'
# Therefore, we use glm2 which is more stable.
# Reuse `form` defined for the OLS model above instead of rebuilding the
# same formula via reformulate() (keeps the models consistent).
fit_glm_gamma <- glm2(form, data = train, 
                      family = Gamma(link = "log"))
summary(fit_glm_gamma)
## 
## Call:
## glm2(formula = reformulate(x_vars, y_var), family = Gamma(link = "log"), 
##     data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.763  -1.957  -1.499  -0.854  51.614  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -0.028350   0.613185  -0.046 0.963125    
## Age                 0.021583   0.003007   7.179 7.11e-13 ***
## LogWeeklyPay        0.670881   0.069254   9.687  < 2e-16 ***
## LogInitial          0.482337   0.020162  23.923  < 2e-16 ***
## HourOfAccident      0.006094   0.008336   0.731 0.464767    
## HoursPerWeek        0.001041   0.007562   0.138 0.890509    
## LogDelay            0.134833   0.037963   3.552 0.000383 ***
## GenderM            -0.347941   0.080536  -4.320 1.56e-05 ***
## MaritalStatusS     -0.153744   0.077268  -1.990 0.046622 *  
## MaritalStatusU      0.122295   0.118381   1.033 0.301579    
## PartTimeFullTimeP   0.204716   0.160176   1.278 0.201230    
## DependentChildren   0.152436   0.064748   2.354 0.018561 *  
## DaysWorkedPerWeek   0.030446   0.089070   0.342 0.732487    
## WeekDayOfAccident2 -0.092122   0.105804  -0.871 0.383928    
## WeekDayOfAccident3 -0.277486   0.105332  -2.634 0.008431 ** 
## WeekDayOfAccident4 -0.254000   0.105647  -2.404 0.016209 *  
## WeekDayOfAccident5 -0.258506   0.105735  -2.445 0.014494 *  
## WeekDayOfAccident6 -0.198524   0.146082  -1.359 0.174156    
## WeekDayOfAccident7 -0.093022   0.177289  -0.525 0.599802    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Gamma family taken to be 61.75326)
## 
##     Null deviance: 312399  on 61512  degrees of freedom
## Residual deviance: 225203  on 61494  degrees of freedom
## AIC: 1138728
## 
## Number of Fisher Scoring iterations: 12
# Prediction on the response scale from the Gamma GLM
glm_gamma_predict <- function(X) {
  predict(fit_glm_gamma, X, type = "response")
}

# Multiplicative bias correction: rescale so the mean prediction on the
# training data equals the observed training mean
corr_fact_glm <- mean(y_train) / mean(glm_gamma_predict(train)) 

# Bias-corrected Gamma GLM prediction (delegates to the base wrapper)
glm_gamma_corr_predict <- function(X) {
  corr_fact_glm * glm_gamma_predict(X)
}

2.5.4 Poisson GLM

# Quasi-Poisson instead of Poisson suppresses warnings for non-integer y
# Reuse `form` defined above instead of rebuilding the same formula via
# reformulate() (keeps the models consistent).
fit_glm_poisson <- glm(
  form, data = train, family = quasipoisson
)
summary(fit_glm_poisson)
## 
## Call:
## glm(formula = reformulate(x_vars, y_var), family = quasipoisson, 
##     data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -948.6  -116.2   -65.4   -35.7  5171.1  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         0.381284   0.294657   1.294 0.195672    
## Age                 0.013423   0.001475   9.098  < 2e-16 ***
## LogWeeklyPay        0.501197   0.036604  13.692  < 2e-16 ***
## LogInitial          0.655883   0.012763  51.390  < 2e-16 ***
## HourOfAccident      0.002261   0.004030   0.561 0.574840    
## HoursPerWeek       -0.004563   0.003310  -1.379 0.167981    
## LogDelay            0.057101   0.019625   2.910 0.003619 ** 
## GenderM            -0.255144   0.039523  -6.456 1.09e-10 ***
## MaritalStatusS     -0.176509   0.039997  -4.413 1.02e-05 ***
## MaritalStatusU     -0.106357   0.053720  -1.980 0.047726 *  
## PartTimeFullTimeP   0.122453   0.072032   1.700 0.089138 .  
## DependentChildren   0.053915   0.028606   1.885 0.059466 .  
## DaysWorkedPerWeek   0.026961   0.038600   0.698 0.484880    
## WeekDayOfAccident2 -0.193843   0.053965  -3.592 0.000328 ***
## WeekDayOfAccident3 -0.233027   0.054652  -4.264 2.01e-05 ***
## WeekDayOfAccident4 -0.262282   0.054996  -4.769 1.85e-06 ***
## WeekDayOfAccident5 -0.274203   0.055754  -4.918 8.76e-07 ***
## WeekDayOfAccident6 -0.098053   0.073250  -1.339 0.180700    
## WeekDayOfAccident7 -0.263446   0.095347  -2.763 0.005728 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 228093.6)
## 
##     Null deviance: 3504504976  on 61512  degrees of freedom
## Residual deviance: 2421145448  on 61494  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 7
# Prediction on the response scale from the quasi-Poisson GLM
glm_poisson_predict <- function(X) {
  predict(fit_glm_poisson, X, type = "response")
}

2.5.5 XGBoost

Reasonable choices for XGBoost’s hyperparameters are provided in the grid search table stored as “grid/grid_xgb.RData”. Thus, there is no need to retune those parameters.

# Data interface: xgboost works on a numeric matrix
dtrain <- xgb.DMatrix(data.matrix(train[, x_vars]), label = y_train)

# Settings: set tune <- TRUE to re-run the grid search below;
# otherwise the saved grid is loaded from file_grid
tune <- FALSE
file_grid <- "grid/grid_xgb.RData"

if (tune) {
  # Step 1: find good learning rate
  xgb.cv(
    list(learning_rate = 0.03),
    dtrain,
    nrounds = 5000,
    tree_method = "hist",
    nfold = 5,
    objective = "reg:gamma",
    showsd = FALSE,
    early_stopping_rounds = 20,
    verbose = 2
  )
  
  # Step 2: Grid search CV on typical parameter combos
  paramGrid <- expand.grid(
    iteration = NA,  # filled in below with the best CV iteration
    score = NA,      # filled in below with the best CV score
    learning_rate = 0.03,
    max_depth = 4:6,
    min_child_weight = c(0, 1e-04),
    colsample_bynode = c(0.8, 1),
    subsample = c(0.8, 1),
    reg_lambda = 0:2,
    reg_alpha = 0:2,
    tree_method = "hist",
    eval_metric = "gamma-nloglik"
  )
  # Keep at most 20 randomly chosen parameter combinations
  if (nrow(paramGrid) > 20) {
    set.seed(342267)
    paramGrid <- paramGrid[sample(nrow(paramGrid), 20), ]
  }
  
  # Cross-validate each combination; intermediate results are saved to
  # disk after every iteration so a crash loses at most one combination
  for (i in seq_len(nrow(paramGrid))) {
    print(i)
    cvm <- xgb.cv(
      as.list(paramGrid[i, -(1:2)]),
      dtrain,
      nrounds = 5000,
      nfold = 5,
      objective = "reg:gamma",
      showsd = FALSE,
      early_stopping_rounds = 20,
      verbose = 0
    )
    
    paramGrid[i, 1] <- bi <- cvm$best_iteration
    paramGrid[i, 2] <- cvm$evaluation_log[bi, "test_gamma_nloglik_mean"] %>% 
      as.numeric()
    save(paramGrid, file = file_grid)
  }
}
# Load the tuned grid (object: paramGrid) from disk
load(file_grid, verbose = TRUE)
## Loading objects:
##   paramGrid
# Step 3: Fit on best params
head(paramGrid <- paramGrid[order(paramGrid$score), ])
params <- paramGrid[1, ]

# Refit on the full training data using the best parameter row
set.seed(93845)
fit_xgb <- xgb.train(
  as.list(params[, -(1:2)]),
  data = dtrain,
  nrounds = params$iteration,
  objective = "reg:gamma"
)

# Predict wrapper for xgb model; x selects the feature columns
xgb_predict <- function(newdata, x = x_vars) {
  predict(fit_xgb, data.matrix(newdata[, x]))
}

# Multiplicative bias correction based on the training data
corr_fact_xgb <- mean(y_train) / mean(xgb_predict(train))

# Bias-corrected prediction (delegates to the base wrapper)
xgb_corr_predict <- function(newdata, x = x_vars) {
  corr_fact_xgb * xgb_predict(newdata, x)
}

# feature importance, helps to select features for calibration analysis
importanceRaw <- xgb.importance(model = fit_xgb)
xgb.plot.importance(importance_matrix = importanceRaw)

2.6 Calibration assessment

2.6.1 Unconditional calibration in terms of bias

# Factor levels fix the display order of the models in tables and plots
model_levels <- c("trivial", "ols", "ols_corr", "glm_gamma",
                  "glm_gamma_corr", "glm_poisson", "xgb", "xgb_corr")

# Long format: one row per observation and model, with prediction and bias
df_cali <- df %>% 
  mutate(
    prediction_trivial = trivial_predict(.),
    prediction_ols = ols_predict(.),
    prediction_ols_corr = ols_corr_predict(.),
    prediction_glm_gamma = glm_gamma_predict(.),
    prediction_glm_gamma_corr = glm_gamma_corr_predict(.),
    prediction_glm_poisson = glm_poisson_predict(.),
    prediction_xgb = xgb_predict(.),
    prediction_xgb_corr = xgb_corr_predict(.)
  ) %>% 
  pivot_longer(
    cols = starts_with("prediction"),
    names_to = "model",
    names_prefix = "prediction_",
    values_to = "prediction"
  ) %>% 
  mutate(
    # bias = prediction minus observation (the negative residual)
    bias = prediction - UltimateIncurredClaimCost,
    model = factor(model, model_levels)
  )

# Unconditional calibration: mean bias per split and model,
# with a t-test of H0: mean bias = 0
df_cali %>% 
  group_by(dataset, model) %>% 
  summarise(
    mean_bias = num(mean(bias), digits = 0),
    p_value = num(t.test(bias)$p.value, sigfig = 2),
    .groups = "drop"
  )

2.6.2 Auto-calibration

Visualized differently on train and test.

# Hexbin of bias vs prediction per model with a GAM smooth (training set)
# NOTE(review): legend.position with coordinates is deprecated in
# ggplot2 >= 3.5 in favor of legend.position.inside — confirm on upgrade.
df_cali %>% 
  filter(dataset == "train") %>%
# sample_n(10000) %>% 
ggplot(aes(x = prediction, y = bias)) +
  geom_hex(bins = 23, show.legend = TRUE) +
  geom_smooth(
    method = "gam", 
    formula = y ~ s(x, bs = "cr", k = 20), 
    se = FALSE
  ) +
  scale_fill_viridis_c(option = "magma", trans = "log10") +
  facet_wrap(~ model) +
  theme(legend.position = c(0.9, 0), legend.justification = c(1, 0)) +
  ggtitle("Bias (negative residuals) vs predicted (training set)")

# Smoothed curve on test
df_cali %>% 
  filter(dataset == "test", prediction <= 1e5) %>% 
ggplot(aes(x = prediction, y = bias, linetype = model, color = model)) +
  geom_smooth(
    method = "gam", 
    formula = y ~ s(x, bs = "cr", k = 20), 
    se = FALSE
  ) +
  coord_cartesian(ylim = c(-2e4, 2e4)) +
  ggtitle("Bias (negative residuals) vs predicted (test set)")

# Table for test function phi = prediction
df_cali %>% 
  group_by(dataset, model) %>%
  summarise(
    bias = num(mean(prediction * bias), sigfig = 4, notation = "sci"),
    .groups = "drop"
  ) %>% 
  pivot_wider(names_from = dataset, values_from = bias)

2.6.3 Calibration conditional on Gender

# Note: We want to divide by n_train and n_test and NOT by n(Gender=M) and n(Gender=F)
# (i.e. test functions phi(x) = 1{Gender = F} and phi(x) = 1{Gender = M})
df_cali %>% 
  group_by(dataset, model) %>%
  summarise(
    F = mean(bias * (Gender == "F")),
    M = mean(bias * (Gender == "M")),
    .groups = "drop"
  )

Bar plot of mean bias on the test set. The first plot for the two test functions \(\phi(x) = 1\{Gender = F\}\) and \(\phi(x) = 1\{Gender = M\}\) and the second one for evaluation of \(\bar V\) stratified by gender.

# Test functions phi(x) = 1{Gender = F} / 1{Gender = M}, averaged over
# the whole test sample
df_indicator <- df_cali %>%
  filter(dataset == "test") %>% 
  group_by(model) %>%
  summarise(
    F = mean(bias * (Gender == "F")),
    M = mean(bias * (Gender == "M")),
    .groups = "drop"
  ) %>%
  pivot_longer(c("F", "M"), names_to = "Gender", values_to = "mean bias") %>% 
  mutate(type = "whole sample")

# Mean bias within each gender subsample (the stratified version)
df_strat <- df_cali %>% 
  filter(dataset == "test") %>% 
  group_by(model, Gender) %>% 
  summarise(`mean bias` = mean(bias), .groups = "drop") %>% 
  mutate(type = "subsample per Gender")

# Dot plot comparing both versions side by side
df_indicator %>% 
  bind_rows(df_strat) %>%
  ggplot(aes(x = Gender, y = `mean bias`, group = model, color = model)) +
  geom_point(size = 3, position = position_dodge(width = 0.3), alpha = 0.7) +
  facet_wrap(~ fct_rev(type)) +
  ggtitle("Mean bias by Gender (test set)")

2.6.4 Calibration conditional on LogWeeklyPay

LogWeeklyPay is the second most important feature after the initial claim reserve estimate.

# Binning of LogWeeklyPay in 10 quantiles
# Use mean of quantile as x-value
breaks <- unique(quantile(df$LogWeeklyPay, (0:10) / 10))
midpoints <- (breaks[-length(breaks)] + breaks[-1]) / 2

# Attach the bin midpoint (back-transformed to the pay scale) to each row
df_binned <- df_cali %>% 
  mutate(
    LogWeeklyPayBinned = midpoints[
      cut(LogWeeklyPay, breaks, include.lowest = TRUE, labels = FALSE)
    ],
    WeeklyPayBinned = exp(LogWeeklyPayBinned) - 1
  )

# Bias summed per bin, then divided by the full split size
# (test-function version; see comment in section 2.6.3)
p1 <- df_binned %>% 
  group_by(dataset, model, WeeklyPayBinned) %>%
  summarise(bias = sum(bias), .groups = "drop") %>% 
  mutate(
    bias = bias / case_when(
      dataset=="train" ~ nrow(train),
      dataset=="test" ~ nrow(test)
    )
  ) %>% 
ggplot(aes(x = WeeklyPayBinned, y = bias, color = model, group = model)) +
  geom_line() +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~ dataset) +
  ylab("mean bias") +
  ggtitle("Mean bias on whole sample") +
  theme(plot.title = element_text(size = 12))

# Mean bias per bin (conditional/stratified version)
p2 <- df_binned %>% 
  group_by(dataset, model, WeeklyPayBinned) %>%
  summarise(bias = mean(bias), .groups = "drop") %>% 
ggplot(aes(x = WeeklyPayBinned, y = bias, color = model, group = model)) +
  geom_line() +
  geom_point() +
  scale_x_log10() +
  facet_wrap(~ dataset) +
  ylab("mean bias") +
  ggtitle("Mean bias per sample in bin") +
  theme(plot.title = element_text(size = 12))

# Stack both panels with a shared legend (patchwork)
p1 / p2 +
  plot_layout(guides = "collect") +
  plot_annotation(title = "Mean bias on binned LogWeeklyPay")

# Test function = LogWeeklyPay
with_options(
  list(scipen = 3, pillar.sigfig = 0),
  df_cali %>% 
    group_by(dataset, model) %>%
    summarise(calib_test = mean(LogWeeklyPay * bias), .groups = "drop") %>% 
    pivot_wider(names_from = dataset, values_from = calib_test)
)

2.7 Model comparison

2.7.1 Compare absolute and relative performance scores

# Returns the mean Gamma deviance and the corresponding D-squared for one
# dataset/model combination. X: data frame; actual/predicted: column names;
# ref: reference mean for the D-squared computation.
perf <- function(X, actual, predicted, ref) {
  obs <- X[[actual]]
  fit <- X[[predicted]]
  tibble(
    Measure = c("Mean deviance", "D-Squared"),
    Score = c(
      `Mean deviance` = deviance_gamma(obs, fit),
      `D-Squared` = r_squared_gamma(obs, fit, reference_mean = ref)
    )
  )
}

# Apply it to combinations of dataset and model
# NOTE(review): cur_data() is deprecated in dplyr >= 1.1 in favor of
# pick() — verify behavior before upgrading.
df_perf <- df_cali %>% 
  group_by(dataset, model) %>% 
  summarize(
    perf(cur_data(), y_var, "prediction", mean(y_train)), 
    .groups = "drop"
  )

# Wide table: one row per model, one column per Measure/dataset pair
df_perf %>% 
  pivot_wider(
    "model", 
    names_from = c("Measure", "dataset"), 
    values_from = "Score", names_sort = TRUE
  )
# Visual comparison of train vs test scores per measure
ggplot(df_perf, aes(y = Score, x = dataset, color = model, group = model)) +
  geom_point(size = 2) +
  geom_line() +
  facet_wrap(~ Measure, scales = "free")

2.7.2 Murphy diagram (for elementary score)

# Murphy diagram: mean elementary score over a log-spaced grid of
# thresholds theta (trivial and uncorrected OLS models excluded)
# NOTE(review): cur_data() is deprecated in dplyr >= 1.1 (use pick());
# the `size` aesthetic for lines is `linewidth` since ggplot2 3.4.
df_murphy <- df_cali %>% 
  filter(!(model %in% c("trivial", "ols"))) %>% 
  group_by(dataset, model) %>% 
  summarize(
    murphy_diagram(
      cur_data()[[y_var]], 
      cur_data()$prediction, 
      plot = FALSE, 
      theta = exp(seq(log(10000), log(1.7e5), by = 0.02))
    ),
    .groups = "drop"
  ) %>% 
  rename(Score = predicted)

# Vertical line is the mean y on the training data (for both graphs)
ggplot(df_murphy, aes(y = Score, x = theta)) +
  geom_line(aes(color = model, linetype = model, group = model), size = 0.75) +
  geom_vline(xintercept = mean(train[[y_var]]), linetype = 2, size = 0.75) +
  facet_wrap(~ dataset, scales = "free", ncol = 2) +
  scale_x_log10() +
  theme(legend.title = element_blank())

2.7.3 Murphy-type diagram (for Tweedie p)

# Tweedie deviance as a function of the Tweedie power p in [1, 3];
# deviances are rescaled by mean(y)^(p - 2) to be comparable across p
df_tweedie <- df_cali %>% 
  filter(!(model %in% c("trivial", "ols"))) %>% 
  group_by(dataset, model) %>% 
  summarize(
    performance(
      cur_data(), 
      actual = y_var, 
      predicted = "prediction", 
      metrics = multi_metric(deviance_tweedie, tweedie_p = seq(1, 3, by = 0.05)), 
      key = "Tweedie_p", 
      value = "Deviance"
    ),
    .groups = "drop"
  ) %>% 
  mutate(
    Tweedie_p = as.numeric(as.character(Tweedie_p)),
    Deviance_rescaled = Deviance * mean(df_cali[[y_var]])^(Tweedie_p-2)
  )

# One curve per model; lower is better for every p
ggplot(df_tweedie, aes(y = Deviance_rescaled, x = Tweedie_p)) +
  geom_line(aes(color = model, linetype = model, group = model), size = 0.75) +
  facet_wrap(~ dataset, scales = "free", ncol = 2) +
  scale_y_log10() +
  theme(legend.title = element_blank())

3 Binary Classification Example

The classification example is based on the “Telco Customer Churn” dataset on OpenML. We will model churn probability as a function of a couple of features.

3.1 Attach all packages and fetch data

The dataset is being downloaded and then stored on disk for reuse.

library(tidyverse)
library(lubridate)
library(hexbin)
library(MetricsWeighted)
library(ranger)
library(xgboost)
library(arrow)
library(ggcorrplot)
library(patchwork)
library(scales)
library(withr)
library(splitTools)
library(reliabilitydiag)

# Telco Customer Churn dataset, see https://www.openml.org/d/42178
# Read the cached parquet file if present; otherwise download once from
# OpenML and cache it on disk for later runs.
if (file.exists("churn.parquet")) {
  df <- read_parquet("churn.parquet")
} else {
  library(OpenML)

  df_origin <- getOMLDataSet(data.id = 42178)
  df <- tibble(df_origin$data)
  write_parquet(df, "churn.parquet")
}

3.2 Data preparation

# Collapse "No internet service"/"No phone service" into plain "No"
df[df == "No internet service" | df == "No phone service"] <- "No"

# Feature engineering: log of total charges (NAs imputed with the median),
# 0/1 response, and character columns as factors.
# `mutate_if()` is superseded; use across(where(...)). Also drops a
# stray trailing comma in the mutate() call.
df <- df %>%
  mutate(
    LogTotalCharges = log(as.numeric(TotalCharges)),
    Churn = (Churn == "Yes") + 0
  ) %>% 
  replace_na(
    list(LogTotalCharges = median(.$LogTotalCharges, na.rm = TRUE))
  ) %>% 
  mutate(across(where(is.character), as.factor))

# Response and feature lists for the classification example
y_var <- "Churn"
x_continuous <- c("tenure", "MonthlyCharges", "LogTotalCharges")
x_discrete <- setdiff(
  colnames(select(df, where(is.factor))), c("customerID", y_var, "TotalCharges")
)
x_vars <- c(x_continuous, x_discrete)

df[c(y_var, x_vars)]

3.3 Exploratory data analysis

3.3.1 Univariate description of target

table(df[[y_var]], useNA = "ifany")
## 
##    0    1 
## 5174 1869

3.3.2 Univariate description

# Univariate description
# Histograms of the continuous features, one facet each with free scales.
# `select_at()` is superseded; use select(all_of(...)).
df %>%
  select(all_of(x_continuous)) %>%
  pivot_longer(everything()) %>%
  mutate(name = factor(name, levels = x_continuous)) %>%
ggplot(aes(x = value)) +
  geom_histogram(bins = 19, fill = "#E69F00") +
  facet_wrap(~name, scales = "free", ncol = 3) +
  labs(y = element_blank()) +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  ) +
  ggtitle("Histograms of numerical features")

# Bar plots of the categorical features.
# `select_at()`/`mutate_all()` are superseded; use all_of()/across().
df %>%
  select(all_of(x_discrete)) %>%
  mutate(across(everything(), as.character)) %>% 
  pivot_longer(everything()) %>%
  mutate(
    name = factor(name, levels = x_discrete),
    value = factor(value)
  ) %>%
ggplot(aes(x = value)) +
  geom_bar(fill = "#E69F00") +
  facet_wrap(~ name, scales = "free", ncol = 3) +
  labs(y = "Count") +
  ggtitle("Barplots of categorical features")

3.3.3 Correlations across continuous covariates

# Pearson correlation matrix of the continuous features (upper triangle).
# `select_at()` is superseded; use select(all_of(...)).
df %>%
  select(all_of(x_continuous)) %>%
  cor() %>%
  round(2) %>%
  ggcorrplot(
    hc.order = FALSE,
    type = "upper",
    outline.col = "white",
    ggtheme = theme_minimal(),
    colors = c("#6D9EC1", "white", "#E46726")
  )

3.3.4 Response in dependence of covariates

# Response in dependence of covariates
# Mean churn rate per level of each categorical feature
df_cat <- df %>% 
  select(all_of(x_discrete), all_of(y_var)) %>%
  mutate(across(-all_of(y_var), as.factor)) %>% 
  pivot_longer(cols = -all_of(y_var)) %>% 
  group_by(name, value) %>% 
  summarize(Churn = mean(Churn), .groups="drop")

# `aes_string()` is deprecated; use the .data pronoun for string column names
ggplot(df_cat, aes(x = value, y = .data[[y_var]])) +
  geom_point(color = "orange", size = 3) +
  facet_wrap(~ name, scales = "free_x") +
  scale_y_log10() +
  xlab(element_blank()) +
  ggtitle("Mean churn per categorical level")

# Smoothed churn rate versus each continuous feature
df_num <- df %>% 
  select(all_of(x_continuous), all_of(y_var)) %>% 
  pivot_longer(cols = -all_of(y_var))

ggplot(df_num, aes(x = value, y = .data[[y_var]])) +
  geom_smooth() +
  facet_wrap(~ name, scales = "free") +
  xlab(element_blank())

3.4 Data split

Next, we split our dataset into training and test/validation data, stratified by the response.

# 75/25 split, stratified by the response (splitTools::partition)
set.seed(34621L)
inds <- partition(df$Churn, p = c(train = 0.75, test = 0.25))

train <- df[inds$train, ]
test <- df[inds$test, ]
y_train <- train[[y_var]]
y_test <- test[[y_var]]

# Record split membership on df: default "test", overwrite training rows
df <- df %>% 
  mutate(dataset = factor(c("test"), c("train", "test")))
df$dataset[inds$train] <- "train"

# Sanity check: churn rate per split (should be nearly equal)
df %>% 
  group_by(dataset) %>% 
  summarise(mean(Churn), .groups="drop")

3.5 The models

3.5.1 Trivial Model

# Trivial model: predicts the training churn rate for every row of X.
# Uses nrow() instead of dim(X)[1] and rep() of the scalar mean directly.
trivial_predict <- function(X) {
  rep(mean(y_train), nrow(X))
}

3.5.2 Logistic regression

# Logistic regression on all features
form <- reformulate(x_vars, y_var)
fit_glm <- glm(form, data = train, family = binomial())
summary(fit_glm)
## 
## Call:
## glm(formula = form, family = binomial(), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1015  -0.6929  -0.2972   0.6361   3.1984  
## 
## Coefficients:
##                                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           2.7490537  0.9568960   2.873  0.00407 ** 
## tenure                               -0.0004815  0.0041139  -0.117  0.90683    
## MonthlyCharges                       -0.0078254  0.0362538  -0.216  0.82910    
## LogTotalCharges                      -0.5921601  0.0595243  -9.948  < 2e-16 ***
## genderMale                           -0.0342426  0.0748712  -0.457  0.64742    
## PartnerYes                            0.0435868  0.0900245   0.484  0.62827    
## DependentsYes                        -0.1256488  0.1035362  -1.214  0.22491    
## PhoneServiceYes                      -0.1201122  0.7424339  -0.162  0.87148    
## MultipleLinesYes                      0.4486445  0.2024268   2.216  0.02667 *  
## InternetServiceFiber optic            1.3643974  0.9119496   1.496  0.13462    
## InternetServiceNo                    -1.5244306  0.9250945  -1.648  0.09938 .  
## OnlineSecurityYes                    -0.2069737  0.2039200  -1.015  0.31012    
## OnlineBackupYes                      -0.0101351  0.2002946  -0.051  0.95964    
## DeviceProtectionYes                   0.1374344  0.2011316   0.683  0.49441    
## TechSupportYes                       -0.1697132  0.2058273  -0.825  0.40963    
## StreamingTVYes                        0.3873036  0.3734791   1.037  0.29973    
## StreamingMoviesYes                    0.4607985  0.3726992   1.236  0.21632    
## ContractOne year                     -0.7008268  0.1236184  -5.669 1.43e-08 ***
## ContractTwo year                     -1.7527819  0.2059230  -8.512  < 2e-16 ***
## PaperlessBillingYes                   0.3720130  0.0861707   4.317 1.58e-05 ***
## PaymentMethodCredit card (automatic) -0.1165994  0.1286154  -0.907  0.36463    
## PaymentMethodElectronic check         0.1808147  0.1089242   1.660  0.09691 .  
## PaymentMethodMailed check            -0.1469906  0.1342096  -1.095  0.27342    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6110.3  on 5280  degrees of freedom
## Residual deviance: 4388.3  on 5258  degrees of freedom
## AIC: 4434.3
## 
## Number of Fisher Scoring iterations: 6
# Out-of-sample Bernoulli pseudo R-squared vs training-mean reference
r_squared_bernoulli(
  y_test, 
  predict(fit_glm, test, type = "response"), 
  reference_mean = mean(y_train)
)
## [1] 0.3399342

3.5.3 Random forest

# Probability random forest; oob.error = TRUE reports the out-of-bag
# Brier score in the printed summary below
fit_rf <- ranger(
  form, 
  data = train, 
  probability = TRUE, 
  seed = 774, 
  min.node.size = 30, 
  oob.error = TRUE
)
fit_rf
## Ranger result
## 
## Call:
##  ranger(form, data = train, probability = TRUE, seed = 774, min.node.size = 30,      oob.error = TRUE) 
## 
## Type:                             Probability estimation 
## Number of trees:                  500 
## Sample size:                      5281 
## Number of independent variables:  18 
## Mtry:                             4 
## Target node size:                 30 
## Variable importance mode:         none 
## Splitrule:                        gini 
## OOB prediction error (Brier s.):  0.1395934

3.5.4 XGBoost

Reasonable choices for XGBoost’s hyperparameters are provided in the grid search table stored as “grid/c_grid_xgb.RData”. Thus, there is no need to retune those parameters.

# Data interface
# XGBoost needs a numeric matrix; data.matrix() encodes factor columns as
# their underlying integer codes.
dtrain <- xgb.DMatrix(data.matrix(train[, x_vars]), label = y_train)

# Settings
# Set tune <- TRUE to rerun the (slow) cross-validated grid search; with
# FALSE, the cached results in file_grid are loaded below instead.
tune <- FALSE
file_grid <- "grid/c_grid_xgb.RData"

if (tune) {
  # Step 1: find good learning rate
  # 5-fold CV with early stopping to gauge how many boosting rounds a
  # learning rate of 0.01 requires.
  xgb.cv(
    list(learning_rate = 0.01),
    dtrain,
    nrounds = 5000,
    nfold = 5,
    objective = "binary:logistic",
    showsd = FALSE,
    early_stopping_rounds = 20,
    verbose = 2
  )
  
  # Step 2: Grid search CV on typical parameter combos
  # iteration/score start as NA and are filled with the best boosting round
  # and its CV log loss for each parameter combination.
  paramGrid <- expand.grid(
    iteration = NA,
    score = NA,
    learning_rate = 0.01,
    max_depth = 3:6,
    min_child_weight = c(0, 1e-04),
    colsample_bynode = c(0.8, 1),
    subsample = c(0.8, 1),
    reg_lambda = 0:2,
    reg_alpha = 0:2,
    eval_metric = "logloss"
  )
  # Randomly subsample the full grid to at most 20 combinations to keep the
  # search affordable (seed fixed for reproducibility).
  if (nrow(paramGrid) > 20) {
    set.seed(342267)
    paramGrid <- paramGrid[sample(nrow(paramGrid), 20), ]
  }
  
  for (i in seq_len(nrow(paramGrid))) {
    print(i)
    cvm <- xgb.cv(
      as.list(paramGrid[i, -(1:2)]),  # drop the iteration/score bookkeeping columns
      dtrain,
      nrounds = 5000,
      nfold = 5,
      objective = "binary:logistic",
      showsd = FALSE,
      early_stopping_rounds = 20,
      verbose = 0
    )
    
    paramGrid[i, 1] <- bi <- cvm$best_iteration
    paramGrid[i, 2] <- cvm$evaluation_log[bi, "test_logloss_mean"] %>% 
      as.numeric()
    # Save after every iteration so partial results survive an interruption.
    save(paramGrid, file = file_grid)
  }
}
# Load the cached grid search results (creates the object `paramGrid`).
load(file_grid, verbose = TRUE)
## Loading objects:
##   paramGrid
# Step 3: Fit on best params
# Sort by CV log loss (ascending) and take the best combination.
head(paramGrid <- paramGrid[order(paramGrid$score), ])
params <- paramGrid[1, ]

set.seed(76)
# Refit on the full training data using the winning parameters and the
# number of boosting rounds found via early stopping during CV.
fit_xgb <- xgb.train(
  as.list(params[, -(1:2)]),
  data = dtrain,
  nrounds = params$iteration,
  objective = "binary:logistic"
)

# Score `newdata` with the fitted XGBoost model `fit_xgb`.
# `x` defaults to the global feature list used at training time; the
# selected columns are converted to the numeric matrix XGBoost expects.
xgb_predict <- function(newdata, x = x_vars) {
  feature_matrix <- data.matrix(newdata[, x])
  predict(fit_xgb, feature_matrix)
}

# feature importance
# Gain-based importance of each feature in the fitted booster, shown as a
# bar chart.
importanceRaw <- xgb.importance(model = fit_xgb)
xgb.plot.importance(importance_matrix = importanceRaw)

3.6 Calibration assessment

3.6.1 Unconditional calibration in terms of bias

# Fixed display order of the models in tables and plots below.
model_levels <- c("trivial", "logreg", "rf", "xgb")

# Long table: one row per (observation, model) with the predicted churn
# probability and the bias (prediction - observed outcome).
# NOTE(review): assumes `df` carries a `dataset` column marking the
# train/test split and a 0/1 `Churn` outcome — defined earlier in the file.
df_cali <- df %>% 
  mutate(
    prediction_trivial = trivial_predict(.),
    prediction_logreg = predict(fit_glm, ., type = "response"),
    # column 2: presumably the positive (churn) class — verify level order
    prediction_rf = predict(fit_rf, .)$predictions[, 2],
    prediction_xgb = xgb_predict(.)
  ) %>% 
  pivot_longer(
    cols = starts_with("prediction"),
    names_to = "model",
    names_prefix = "prediction_",
    values_to = "prediction"
  ) %>% 
  mutate(
    bias = prediction - Churn,
    model = factor(model, model_levels)
  )

# Unconditional calibration
# Mean bias per model and dataset; zapsmall() displays numerically-zero
# means as exact zeros.
df_cali %>% 
  group_by(dataset, model) %>% 
  summarise(Mean_bias = mean(bias), .groups = "drop") %>% 
  mutate(Mean_bias = zapsmall(Mean_bias)) %>% 
  pivot_wider(
    id_cols = "model", 
    names_from = "dataset", 
    values_from = "Mean_bias"
  )

3.6.2 Reliability diagram for assessment of auto-calibration

# Reliability diagram objects for the three non-trivial models on the test
# set; region.level = NA disables uncertainty/consistency regions.
reldiag <- reliabilitydiag(
  logreg = filter(df_cali, model == "logreg", dataset == "test")$prediction,
  rf = filter(df_cali, model == "rf", dataset == "test")$prediction,
  xgb = filter(df_cali, model == "xgb", dataset == "test")$prediction,
  y = y_test,
  xtype = "continuous",
  region.level = NA
)

# Get decomposition of log loss score => does not work
log_loss_score <- function(obs, pred){
  # Raw negative Bernoulli log-likelihood per observation.
  raw <- -obs * log(pred) - (1 - obs) * log(1 - pred)
  # A perfect deterministic forecast would yield 0 * log(0) = NaN above,
  # so those cases are scored as exactly zero instead.
  exact_hit <- (obs == 0 & pred == 0) | (obs == 1 & pred == 1)
  ifelse(exact_hit, 0, raw)
}

# Score decomposition of log loss
# Score decomposition of log loss
# summary() decomposes the mean score into miscalibration (MCB),
# discrimination (DSC) and uncertainty (UNC); format them as a text label
# for annotating the reliability diagrams below.
log_loss_decomp_text <- summary(reldiag, score = "log_loss_score") %>% 
  mutate(
    text = paste0(
      "MCB = ", format(miscalibration, digits = 3), "\n",
      "DSC = ", format(discrimination, digits = 3), "\n",
      "UNC = ", format(uncertainty, digits = 3)
    )
  )

# One reliability diagram per model (faceted by forecast), with the
# prediction histogram and the MCB/DSC/UNC decomposition placed in the
# top-left corner of each facet.
autoplot(
    reldiag,
    params_CEPline = list(size = 0.5),
    params_histogram = list(colour = "black", fill = NA)
  ) +
  facet_wrap("forecast") +
  ggtitle("Reliability diagrams (test set)") +
  guides(
    color = guide_legend(title = "model"), 
    linetype = guide_legend(title = "model")
  ) +
  geom_label(
    data = log_loss_decomp_text,
    mapping = aes(x = 0.0, y = 1, label = text),
    size = 3, 
    hjust = "left", 
    vjust = "top"
  )

3.7 Model comparison

3.7.1 Compare absolute and relative performance scores

# Compute three performance measures for one (dataset, model) combination:
# mean log loss, D-squared (deviance improvement relative to a trivial
# forecast of `ref`), and AUC. Returns a tibble with columns Score/Measure.
# Note: Bernoulli deviance = 2 * logloss
perf <- function(X, actual, predicted, ref) {
  act <- X[[actual]]
  pred <- X[[predicted]]
  scores <- c(
    `log loss` = logLoss(act, pred),
    `D-Squared` = r_squared_bernoulli(act, pred, reference_mean = ref),
    AUC = AUC(act, pred)
  )
  tibble(Score = scores) %>% 
    mutate(Measure = factor(names(Score), names(Score)))
}

# Apply it to combinations of dataset and model
# A data-frame-valued expression inside summarize() expands each group to
# one row per performance measure.
df_perf <- df_cali %>% 
  group_by(dataset, model) %>% 
  summarize(
    perf(cur_data(), y_var, "prediction", mean(y_train)), 
    .groups = "drop"
  )

# Wide table (one row per model, one column per measure/dataset pair) with
# scores shown to 3 digits — used for the text of the user guide.
df_perf_for_text <- df_perf %>% 
  mutate(Score = num(Score, digits = 3)) %>% 
  pivot_wider(
    "model", 
    names_from = c("Measure", "dataset"), 
    values_from = "Score", 
    names_sort = TRUE
  )

# Plot log loss and D-squared (AUC omitted) by dataset and model.
df_perf %>% 
  filter(Measure != "AUC") %>% 
ggplot(aes(y = Score, x = dataset, color = model, group = model)) +
  geom_point(size = 2) +
  geom_line() +
  facet_wrap(~ Measure, scales = "free")

3.7.2 Murphy diagram for elementary score

# Murphy diagram: mean elementary score per threshold theta for each model.
# NOTE(review): "ols" matches no model level here — presumably a leftover
# from the regression example; the filter effectively drops only "trivial".
df_murphy <- df_cali %>% 
  filter(!(model %in% c("trivial", "ols"))) %>% 
  group_by(dataset, model) %>% 
  summarize(
    murphy_diagram(
      cur_data()[[y_var]], 
      cur_data()$prediction, 
      plot = FALSE, 
      theta = seq(0, 1, by = 0.02)
    ),
    .groups = "drop"
  ) %>% 
  rename(Score = predicted)

# Vertical line is the mean y on the training data (for both graphs)
ggplot(df_murphy, aes(y = Score, x = theta)) +
  geom_line(aes(color = model, linetype = model, group = model), size = 0.75) +
  geom_vline(xintercept = mean(train[[y_var]]), linetype = 2, size = 0.75) +
  facet_wrap(~ dataset, scales = "free", ncol = 2) +
  theme(legend.position = c(0.25, 0.2), legend.title = element_blank())

4 Session Info

The HTML is generated with the following packages (slightly newer than the ones used in the published tutorial).

# Record the R version and package versions used to render this document.
sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] splines   stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] reliabilitydiag_0.2.0 splitTools_0.3.1      ranger_0.12.1        
##  [4] OpenML_1.10           withr_2.4.2           glm2_1.2.1           
##  [7] scales_1.1.1          patchwork_1.1.1       ggcorrplot_0.1.3     
## [10] arrow_4.0.1           xgboost_1.4.1.1       MetricsWeighted_0.5.3
## [13] hexbin_1.28.2         lubridate_1.7.10      forcats_0.5.1        
## [16] stringr_1.4.0         dplyr_1.0.6           purrr_0.3.4          
## [19] readr_1.4.0           tidyr_1.1.3           tibble_3.1.2         
## [22] ggplot2_3.3.3         tidyverse_1.3.1      
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-152      fs_1.5.0          bit64_4.0.5       httr_1.4.2       
##  [5] tools_4.0.4       backports_1.2.1   bslib_0.3.1       utf8_1.2.1       
##  [9] R6_2.5.0          DBI_1.1.1         mgcv_1.8-36       colorspace_2.0-1 
## [13] tidyselect_1.1.1  bit_4.0.4         curl_4.3.1        compiler_4.0.4   
## [17] cli_2.5.0         rvest_1.0.0       xml2_1.3.2        labeling_0.4.2   
## [21] sass_0.4.0        checkmate_2.0.0   digest_0.6.27     rmarkdown_2.8    
## [25] pkgconfig_2.0.3   htmltools_0.5.2   dbplyr_2.1.1      fastmap_1.1.0    
## [29] highr_0.9         rlang_0.4.11      readxl_1.3.1      rstudioapi_0.13  
## [33] BBmisc_1.11       jquerylib_0.1.4   generics_0.1.0    farver_2.1.0     
## [37] jsonlite_1.7.2    magrittr_2.0.1    Matrix_1.3-4      Rcpp_1.0.7       
## [41] munsell_0.5.0     fansi_0.5.0       lifecycle_1.0.0   stringi_1.6.2    
## [45] yaml_2.2.1        plyr_1.8.6        grid_4.0.4        crayon_1.4.1     
## [49] lattice_0.20-44   haven_2.4.1       hms_1.1.0         knitr_1.33       
## [53] pillar_1.6.1      reshape2_1.4.4    reprex_2.0.0      XML_3.99-0.6     
## [57] glue_1.4.2        evaluate_0.14     data.table_1.14.0 modelr_0.1.8     
## [61] vctrs_0.3.8       cellranger_1.1.0  gtable_0.3.0      assertthat_0.2.1 
## [65] cachem_1.0.5      xfun_0.23         broom_0.7.7       farff_1.1.1      
## [69] viridisLite_0.4.0 memoise_2.0.0     ellipsis_0.3.2